library(NbClust)
library(fastcluster)
library(FactoMineR)

source("longTAPIO.R")
source("longTAPIO_try.R")
source("TAPIO.R")
source("calc_SIL.R")
source("association.R")

#' Most frequent value of a vector
#'
#' @param x A vector.
#' @return The value that occurs most often in `x`; when several values are
#'   tied, the one appearing first in `x` is returned.
Mode <- function(x) {
  distinct_values <- unique(x)
  # Map every element to the position of its distinct value, then count
  # how often each position occurs.
  value_index <- match(x, distinct_values)
  counts <- tabulate(value_index)
  # which.max() picks the first maximum, so ties go to the earliest value.
  distinct_values[which.max(counts)]
}

Univariate data

Three diverging lines (kml)

# Simulate univariate trajectories with kml: three groups of 50 subjects whose
# mean curves are flat (0), decreasing (-t) and increasing (t); the residual
# function draws Gaussian noise with standard deviation 0.35.
set.seed(123)
ex2 <- kml::generateArtificialLongData(
  meanTrajectories = list(function(t) 0, function(t) -t, function(t) t),
  nbEachClusters = c(50, 50, 50),
  residualVariation = function(t) {
    rnorm(1, 0, 0.35)
  }
)

# Ground-truth labels.
# NOTE(review): assumes kml emits the subjects cluster by cluster, in the order
# given by nbEachClusters -- confirm.
trueClusIDs <- rep(1:3, each = 50)
#plot(ex2,parTraj=parTRAJ(col=rep(2:4,each=50)))
x <- attr(ex2, "traj") # columns: time, rows: subjects
# One ID per subject, repeated once per time point. seq_len() avoids the
# c(1, 0) sequence that 1:nrow(x) would produce for an empty matrix.
user_ids <- rep(seq_len(nrow(x)), each = ncol(x))

# One line per subject, coloured by true cluster.
matplot(t(x), type = "l", lty = 1, col = trueClusIDs)
grid()

kml-inspired distance matrix

# Cluster the subjects with longTAPIO_try. The trajectories are stacked into a
# single column (one row per subject/time observation) and user_ids marks
# which rows belong to which subject.
res <- longTAPIO_try(
  matrix(as.vector(t(x)), ncol = 1),
  k = 3,
  user_id = user_ids,
  levels = 3,
  verbose = 1
)
## data dimension: 1650 1
foundClusIDs <- res$cl
# Confusion matrix: found vs. true cluster labels
table(foundClusIDs, trueClusIDs)
##             trueClusIDs
## foundClusIDs  1  2  3
##            1 50  0  0
##            2  0 50  0
##            3  0  0 50

row sampling

# Compare to row sampling: same stacked data, clustered with longTAPIO.
# Subject IDs are attached as row names (presumably how longTAPIO identifies
# subjects -- confirm against longTAPIO.R).
DATA <- matrix(as.vector(t(x)), ncol = 1)
rownames(DATA) <- user_ids
res2 <- longTAPIO(DATA, k = 3, levels = 3)

foundClusIDs <- res2$cl
# Confusion matrix (cluster numbering is arbitrary, so rows may be permuted)
table(foundClusIDs, trueClusIDs)
##             trueClusIDs
## foundClusIDs  1  2  3
##            1 47  0  0
##            2  3  0 50
##            3  0 50  0

iris

kml-inspired distance matrix

# NOTE(review): D_norm and outcome are not defined in the visible part of this
# script -- confirm they are created before this point.
res <- longTAPIO_try(
  D_norm,
  user_id = rownames(D_norm),
  k = 3,
  n_trees = 1000,
  levels = 3
)
# True label per subject: the most frequent outcome among that subject's rows.
# NOTE(review): aggregate() returns groups in sorted order of the grouping
# values; confirm res$cl lists subjects in the same order.
trueClusIDs <- aggregate(
  as.numeric(outcome),
  by = list(rownames(D_norm)),
  FUN = Mode
)[, 2]
foundClusIDs <- res$cl
# Confusion matrix (cluster numbering is arbitrary, so rows may be permuted)
table(foundClusIDs, trueClusIDs)
##             trueClusIDs
## foundClusIDs 1 2 3
##            1 3 0 0
##            2 0 0 3
##            3 0 4 0
# Sample size seems too low for any conclusion

row sampling

# Row-sampling variant on the same data.
# NOTE(review): D_norm and outcome are not defined in the visible part of this
# script -- confirm they are created before this point.
res <- longTAPIO(D_norm, k = 3, n_trees = 1000, levels = 3)
# True label per subject: the most frequent outcome among that subject's rows.
# NOTE(review): aggregate() returns groups in sorted order of the grouping
# values; confirm res$cl lists subjects in the same order.
trueClusIDs <- aggregate(
  as.numeric(outcome),
  by = list(rownames(D_norm)),
  FUN = Mode
)[, 2]
foundClusIDs <- res$cl
# Confusion matrix (cluster numbering is arbitrary, so rows may be permuted)
table(foundClusIDs, trueClusIDs)
##             trueClusIDs
## foundClusIDs 1 2 3
##            1 2 0 1
##            2 1 3 0
##            3 0 1 2
# Sample size seems too low for any conclusion

Multiple dimensions! (from clusterMLD)

# Load the multi-dimensional example and plot it with clusterMLD.
# NOTE(review): `output` is presumably restored by load() from the .rda file --
# confirm. The file name casing ("Longdat") differs from the
# "LongDat_interpolated.rda" loaded later; check on case-sensitive systems.
p <- "data/"
load(paste0(p, "Longdat_clusters.rda"))
clusterMLD::MeanPlot(output)

Interpolated Data at fixed times:

# NOTE(review): presumably restores y_int, which the code below reads --
# confirm against the contents of the .rda file.
load("data/LongDat_interpolated.rda")

kml-inspired distance matrix

# Cluster on the first five columns of y_int, with y_int$id marking which
# rows belong to which subject.
res <- longTAPIO_try(
  as.matrix(y_int[, 1:5]),
  k = 4,
  user_id = y_int$id,
  levels = 4,
  verbose = 1
)
## data dimension: 2000 5
# True label per subject: the label on that subject's first row.
# NOTE(review): aggregate() returns groups in sorted order of y_int$id;
# confirm res$cl lists subjects in the same order.
trueClusIDs <- aggregate(
  y_int$label,
  by = list(y_int$id),
  FUN = function(labels) labels[1]
)[, 2]
foundClusIDs <- res$cl
# Confusion matrix: found vs. true cluster labels
table(foundClusIDs, trueClusIDs)
##             trueClusIDs
## foundClusIDs  1  2  3  4
##            1  5  5  0  0
##            2  0 60  0  0
##            3  0  0 65  0
##            4  0  0  0 65

row sampling

# Compare to row sampling: same five columns, clustered with longTAPIO.
# Subject IDs are attached as row names (presumably how longTAPIO identifies
# subjects -- confirm against longTAPIO.R).
DATA <- as.matrix(y_int[, 1:5])
rownames(DATA) <- y_int$id
res2 <- longTAPIO(DATA, k = 4, levels = 4)

foundClusIDs <- res2$cl
# Confusion matrix (cluster numbering is arbitrary, so rows may be permuted)
table(foundClusIDs, trueClusIDs)
##             trueClusIDs
## foundClusIDs  1  2  3  4
##            1  5 20  0  0
##            2  0 45  7  0
##            3  0  0 57  0
##            4  0  0  1 65